From d87288573e19a7aca802d172e80bbafbf692dc71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20Luka=20=C5=A0ijanec?= Date: Mon, 7 Nov 2022 12:43:33 +0100 Subject: it's possible that thumbnailURL does not exist (acsm id 217679) --- gather.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/gather.py b/gather.py index 8ce676e..d4beec6 100755 --- a/gather.py +++ b/gather.py @@ -10,7 +10,7 @@ try: except ModuleNotFoundError: raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy") try: - from bs4 import BeautifulSoup + from bs4 import BeautifulSoup, FeatureNotFound except ModuleNotFoundError: raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4") @@ -29,7 +29,7 @@ class Book(Base): creator = Column(String, nullable=True, doc="author of the book, dc:creator in acsm") publisher = Column(String, nullable=True, doc="publisher of the book, dc:publisher in acsm") identifier = Column(String, nullable=True, doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.") - thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png") + thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element") format = Column(String, nullable=True, doc="format of the file. I've seen application/pdf and application/epub+zip") language = Column(String, nullable=True, doc="language of the book. I've seen sl.") borrows = relationship("Borrow", back_populates="book"); @@ -106,7 +106,10 @@ try: force_acsm_id = acsm_id+1 failed_acsms += 1 else: - acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8") + try: + acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8") + except FeatureNotFound: + raise FeatureNotFound("pip3 install lxml") ft = acsm.fulfillmentToken expected = f"ACS-BIBL-L-{acsm_id}" if ft.transaction.string != expected: @@ -127,9 +130,14 @@ try: raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {ft.resourceItemInfo.licenseToken.resource.string} in acsm {acsm_id}") uuid = expected.split(":").pop() expected = f"https://cs.alliance.inkbook.eu/books/{uuid}." - if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True: - raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}") - thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop() + try: + if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True: + raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}") + thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop() + except AttributeError: + thumbnail_extension = None + if ft.resourceItemInfo.metadata.thumbnailURL != None: + raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}") duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string) if duration != int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string): raise ValueError(f"expected {duration} in fr.int(resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {int(resourceItemInfo.licenseToken.permissions.play.duration.string)} in acsm {acsm_id}") -- cgit v1.2.3